In [1]:
import pandas as pd
from pandas.tseries.offsets import DateOffset
import requests
from bs4 import BeautifulSoup
import json
from collections import Counter
from matplotlib import pyplot as plt
from datetime import datetime
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import re
import nltk

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
In [3]:
import pandas as pd
import requests
import re
from datetime import datetime

def get_latest_cleaned_csv_url(user, repo, path="data/cleaned"):
    api_url = f"https://api.github.com/repos/{user}/{repo}/contents/{path}"
    response = requests.get(api_url)
    files = response.json()

    csv_files = []
    for file in files:
        name = file['name']
        if name.startswith("retail_sugar_prices_") and name.endswith(".csv"):
            match = re.search(r"(\d{4}-\d{2}-\d{2})", name)
            if match:
                csv_files.append((match.group(1), name))

    if not csv_files:
        raise ValueError("❌ No cleaned CSV files found.")

    # Get latest date
    latest_date, latest_file = sorted(csv_files)[-1]
    print(f"📁 Latest file found: {latest_file}")

    raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/main/{path}/{latest_file}"
    return raw_url

# === USE IT ===
user = "Neeti3107"
repo = "Foundation-Project_Group-14"
latest_csv_url = get_latest_cleaned_csv_url(user, repo)

# Load the file
df_filtered = pd.read_csv(latest_csv_url, parse_dates=['date'])
df_filtered.head()
📁 Latest file found: retail_sugar_prices_2025-04-19.csv
Out[3]:
date admin1 admin2 market market_id latitude longitude category commodity commodity_id unit priceflag pricetype currency price usdprice
0 1994-01-15 Delhi Delhi Delhi 934 28.67 77.22 miscellaneous food sugar 97 KG actual retail INR 13.50 0.43
1 1994-01-15 Himachal Pradesh Shimla Shimla 963 31.10 77.17 miscellaneous food sugar 97 KG actual retail INR 13.75 0.44
2 1994-01-15 Karnataka Bangalore Urban Bengaluru 926 12.96 77.58 miscellaneous food sugar 97 KG actual retail INR 13.20 0.42
3 1994-01-15 Maharashtra Mumbai city Mumbai 955 18.98 72.83 miscellaneous food sugar 97 KG actual retail INR 13.80 0.44
4 1994-01-15 Telangana Hyderabad Hyderabad 941 17.38 78.47 miscellaneous food sugar 97 KG actual retail INR 13.00 0.41
In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- EDA and Visualizations ---
# Step 1: Sort and create a copy
df_eda = df_filtered[['date', 'price']].copy()
df_eda = df_eda.sort_values('date')

# Step 2: Add Month and Year columns
df_eda['year'] = df_eda['date'].dt.year
df_eda['month'] = df_eda['date'].dt.to_period('M')

# Step 3: Percentage change
df_eda['pct_change'] = df_eda['price'].pct_change() * 100

# Step 4: Monthly average price
df_monthly = df_eda.groupby('month')['price'].mean().reset_index()
df_monthly['month'] = df_monthly['month'].dt.to_timestamp()

# Step 5: Line plot of monthly average price
plt.figure(figsize=(14, 6))
sns.lineplot(data=df_monthly, x='month', y='price', marker='o')
plt.title("Monthly Average Sugar Price (INR)")
plt.xlabel("Month")
plt.ylabel("Average Price (INR)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 6: Monthly percentage change
df_monthly['pct_change'] = df_monthly['price'].pct_change() * 100

plt.figure(figsize=(14, 6))
sns.barplot(data=df_monthly, x='month', y='pct_change', color='orange')
plt.title("Monthly Percentage Change in Sugar Price")
plt.xlabel("Month")
plt.ylabel("Percentage Change")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


# Step 7: Price histogram
plt.figure(figsize=(10, 6))
sns.histplot(df_eda['price'], bins=30, kde=True)
plt.title("Distribution of Sugar Prices (Retail)")
plt.xlabel("Price (INR)")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 8: Boxplot by year
plt.figure(figsize=(14, 6))
sns.boxplot(x='year', y='price', data=df_eda)
plt.title("Year-wise Distribution of Sugar Prices (Retail)")
plt.xlabel("Year")
plt.ylabel("Price (INR)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 9: Heatmap of average price by year and month
df_eda['month_num'] = df_eda['date'].dt.month
pivot_table = df_eda.pivot_table(values='price', index='year', columns='month_num', aggfunc='mean')

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='YlGnBu', annot=True, fmt=".1f")
plt.title("Monthly Sugar Prices Heatmap (Retail, INR)")
plt.xlabel("Month")
plt.ylabel("Year")
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [5]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

# Ensure 'month' is datetime and sorted
df_monthly = df_monthly.sort_values('month')

# Plot using Matplotlib to keep datetime on x-axis
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(df_monthly['month'], df_monthly['pct_change'], color='orange')

# Formatting
ax.set_title("Monthly Percentage Change in Sugar Price")
ax.set_xlabel("Year")
ax.set_ylabel("Percentage Change")

# Set x-axis ticks every 5 years from Jan 1994
locator = mdates.YearLocator(base=5, month=1, day=1)
formatter = mdates.DateFormatter('%Y')
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# Set limits from Jan 1994 to last available date
start_date = pd.Timestamp('1994-01-01')
end_date = df_monthly['month'].max()
ax.set_xlim([start_date, end_date])

plt.xticks(rotation=0)
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Create time series with datetime index
df_decompose = df_monthly[['month', 'price']].copy()
df_decompose.set_index('month', inplace=True)
df_decompose = df_decompose.sort_index()

# Optional: Interpolate or drop missing values
df_decompose = df_decompose.interpolate()

# Specify period manually (12 for monthly data)
decompose_result = seasonal_decompose(df_decompose['price'], model='additive', period=12)

# Plot the full decomposition
decompose_result.plot()
plt.suptitle("Seasonal Decomposition of Sugar Prices (Additive Model)", fontsize=16)
plt.tight_layout()
plt.show()

# Plot trend separately
plt.figure(figsize=(16, 6))
decompose_result.trend.plot(color='orange', title='Trend Component')
plt.grid(True)
plt.show()

# Plot seasonality separately
plt.figure(figsize=(16, 6))
decompose_result.seasonal.plot(color='green', title='Seasonal Component')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

📊 Insights from EDA of Retail Sugar Prices (1994–2025)¶


🧭 1. Overall Trends (Long-Term Behavior)¶

  • Sugar prices in India have steadily increased over the last 30 years.
  • From around ₹8/kg in 1994, prices have climbed to over ₹45/kg by recent years.
  • A strong upward trend is clearly visible in the trend decomposition plot, especially during:
    • 2009–2011: Noticeable inflationary spike in sugar prices.
    • 2020–2022: Gradual increase likely influenced by pandemic-era disruptions.

📈 2. Monthly Average & Seasonality¶

  • There is seasonal fluctuation in sugar prices, repeating every year.
  • Prices tend to drop in early months (April–May) and rise again toward the year-end.
  • This pattern suggests:
    • A possible link to agricultural harvest cycles or festive demand variations.
    • E.g., Diwali or end-of-year celebrations may create demand surges.

🧮 3. Monthly Percentage Change¶

  • Most months show mild percentage changes (<5%), but a few months spike or drop sharply:
    • These may correspond to policy changes, import/export controls, or supply shocks.
    • Sudden changes often align with known economic events or weather-related impacts.

📦 4. Distribution Insights¶

  • The most common sugar price historically was ₹8, which occurred 21 times — mostly in early years.
  • Overall, sugar prices follow a right-skewed distribution with a concentration of values between ₹20–₹40/kg.
  • This skew indicates gradual but consistent inflation in consumer sugar pricing.

📅 5. Year-wise Variability¶

  • The boxplot shows wider price ranges in later years (post-2010), suggesting:
    • Increased volatility in the market.
    • Possibly driven by global market influences, fuel costs, or climate-driven variability.

🌡️ 6. Heatmap View¶

  • The heatmap confirms a repeating seasonal cycle:
    • Sugar prices are lower in mid-year months and higher towards the end/start of each year.
    • 2009, 2016, and 2020 stand out with unusual spikes, possibly due to external shocks.

In [7]:
# Check Which States Have the Most Complete Sugar Price Data

# Add year-month for aggregation
df_filtered['year_month'] = df_filtered['date'].dt.to_period('M').dt.to_timestamp()

# Count how many months of data are available per state
state_monthly_counts = (
    df_filtered.groupby(['admin1', 'year_month'])['price']
    .count()
    .reset_index(name='count')
)

# Pivot to create heatmap-like matrix: rows=state, columns=month, values=presence (0/1)
state_presence = state_monthly_counts.copy()
state_presence['present'] = 1  # Mark available data
heatmap_df = state_presence.pivot(index='admin1', columns='year_month', values='present').fillna(0)

# Optional: sort states by number of months available
heatmap_df['total_months'] = heatmap_df.sum(axis=1)
heatmap_df = heatmap_df.sort_values('total_months', ascending=False).drop(columns='total_months')

# Plot heatmap
plt.figure(figsize=(16, 10))
sns.heatmap(heatmap_df, cmap='Greens', cbar=False, linewidths=0.1)
plt.title("📊 State-wise Monthly Data Completeness for Retail Sugar Prices")
plt.xlabel("Year-Month")
plt.ylabel("State (admin1)")
plt.tight_layout()
plt.show()

# Print summary table: states with most and least data
summary = (
    df_filtered.groupby('admin1')['year_month']
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name='months_of_data')
)
print("Top states by data completeness:\n", summary.head(10))
print("\nBottom states by data completeness:\n", summary.tail(10))
C:\Users\neeti\AppData\Local\Temp\ipykernel_1876\757177853.py:28: UserWarning: Glyph 128202 (\N{BAR CHART}) missing from current font.
  plt.tight_layout()
C:\Users\neeti\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 128202 (\N{BAR CHART}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
Top states by data completeness:
              admin1  months_of_data
0       Maharashtra             264
1        Tamil Nadu             263
2            Orissa             255
3         Rajasthan             249
4             Bihar             243
5         Karnataka             241
6    Madhya Pradesh             239
7            Kerala             238
8     Uttar Pradesh             237
9  Himachal Pradesh             235

Bottom states by data completeness:
                  admin1  months_of_data
21          Uttarakhand             128
22       Andhra Pradesh             116
23             Nagaland             100
24           Chandigarh              99
25  Andaman and Nicobar              99
26           Puducherry              83
27                  Goa              78
28         Chhattisgarh              33
29               Sikkim              24
30              Manipur              13
In [8]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Create pivot table (rows = date, columns = states)
pivot_prices = (
    df_filtered.groupby(['year_month', 'admin1'])['price']
    .mean()
    .unstack()
)

# Step 2: Interpolate missing values (linear within time)
pivot_prices_interp = pivot_prices.interpolate(method='linear', limit_direction='both')

# Optional: Only keep states with at least 180 months (15 years) of data
valid_states = pivot_prices_interp.columns[pivot_prices_interp.notna().sum() >= 180]
pivot_prices_interp = pivot_prices_interp[valid_states]

# Step 3: Standardize (each state's time series gets mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pivot_prices_interp.fillna(0).T)  # Transpose: states as rows

# Step 4: Apply KMeans clustering (you can tune 'n_clusters')
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to state names
cluster_df = pd.DataFrame({
    'state': pivot_prices_interp.columns,
    'cluster': clusters
}).sort_values('cluster')

print(cluster_df)

# Step 5: Optional PCA for 2D visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='Set2', s=100)
for i, state in enumerate(pivot_prices_interp.columns):
    plt.text(X_pca[i, 0], X_pca[i, 1], state, fontsize=9, ha='right')
plt.title("📍 State Clustering by Sugar Price Pattern (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\neeti\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\neeti\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
                  state  cluster
15          Maharashtra        0
28        Uttar Pradesh        0
26            Telangana        0
25           Tamil Nadu        0
23            Rajasthan        0
20               Orissa        0
14       Madhya Pradesh        0
13               Kerala        0
12            Karnataka        0
11            Jharkhand        0
30          West Bengal        0
8               Gujarat        0
6                 Delhi        0
2                 Assam        0
3                 Bihar        0
4            Chandigarh        0
10     Himachal Pradesh        0
0   Andaman and Nicobar        1
16              Manipur        1
24               Sikkim        1
1        Andhra Pradesh        2
22               Punjab        2
9               Haryana        2
29          Uttarakhand        2
7                   Goa        2
21           Puducherry        2
5          Chhattisgarh        2
19             Nagaland        3
18              Mizoram        3
17            Meghalaya        3
27              Tripura        3
C:\Users\neeti\AppData\Local\Temp\ipykernel_1876\1823660468.py:49: UserWarning: Glyph 128205 (\N{ROUND PUSHPIN}) missing from current font.
  plt.tight_layout()
C:\Users\neeti\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 128205 (\N{ROUND PUSHPIN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image

🧠 Cluster Interpretations¶


✅ Cluster 0 – Majority Group (Stable/Aligned Trend)¶

States: Maharashtra, Uttar Pradesh, Tamil Nadu, Rajasthan, Karnataka, Kerala, Madhya Pradesh, Gujarat, etc.

  • These states show similar seasonal patterns and consistent data availability.
  • Likely follow national sugar price trends.
  • Suitable for country-level modeling or selecting a representative sample group.

🌴 Cluster 1 – Sparse/Irregular or Island Territories¶

States: Andaman & Nicobar, Manipur, Sikkim

  • Often have sparse data or irregular price patterns.
  • May have unique supply chains (e.g., non-agricultural or import-dependent).
  • Not ideal for standard trend modeling without adjustment.

⚡ Cluster 2 – Semi-distinct / Volatile Trends¶

States: Andhra Pradesh, Punjab, Haryana, Goa, Puducherry, Chhattisgarh

  • Show greater price volatility or regional fluctuations.
  • Could be influenced by local governance, infrastructure, or supply-demand issues.
  • May need separate forecasting models or volatility handling.

🏔️ Cluster 3 – Northeast Focused Outliers¶

States: Nagaland, Mizoram, Meghalaya, Tripura

  • Exhibit distinctive pricing behavior compared to mainland states.
  • Influenced by transport costs, geography, or border trade policies.
  • Important to treat as a unique regional group in analysis.

In [9]:
pip install plotly geopandas
Requirement already satisfied: plotly in c:\users\neeti\anaconda3\lib\site-packages (5.22.0)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: geopandas in c:\users\neeti\anaconda3\lib\site-packages (1.0.1)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\neeti\anaconda3\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: packaging in c:\users\neeti\anaconda3\lib\site-packages (from plotly) (23.2)
Requirement already satisfied: numpy>=1.22 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (1.26.4)
Requirement already satisfied: pyogrio>=0.7.2 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (0.10.0)
Requirement already satisfied: pandas>=1.4.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (2.2.2)
Requirement already satisfied: pyproj>=3.3.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (3.7.1)
Requirement already satisfied: shapely>=2.0.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (2.1.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2023.3)
Requirement already satisfied: certifi in c:\users\neeti\anaconda3\lib\site-packages (from pyogrio>=0.7.2->geopandas) (2024.7.4)
Requirement already satisfied: six>=1.5 in c:\users\neeti\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.4.0->geopandas) (1.16.0)
In [10]:
import pandas as pd
import plotly.express as px
import json
import requests

# Your state-cluster data
state_cluster_df = pd.DataFrame({
    'state': [
        'Maharashtra', 'Uttar Pradesh', 'Telangana', 'Tamil Nadu', 'Rajasthan',
        'Orissa', 'Madhya Pradesh', 'Kerala', 'Karnataka', 'Jharkhand',
        'West Bengal', 'Gujarat', 'Delhi', 'Assam', 'Bihar', 'Chandigarh',
        'Himachal Pradesh', 'Andaman and Nicobar', 'Manipur', 'Sikkim',
        'Andhra Pradesh', 'Punjab', 'Haryana', 'Uttarakhand', 'Goa',
        'Puducherry', 'Chhattisgarh', 'Nagaland', 'Mizoram', 'Meghalaya', 'Tripura'
    ],
    'cluster': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3]
})

# Load India states GeoJSON
# Source: https://github.com/geohacker/india/blob/master/state/india_telengana.geojson
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'  # For US by default
india_geojson_url = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_telengana.geojson'
india_states = requests.get(india_geojson_url).json()

# Fix names to match if needed
state_cluster_df['state'] = state_cluster_df['state'].str.title()

# Plotly Choropleth
fig = px.choropleth_mapbox(
    state_cluster_df,
    geojson=india_states,
    featureidkey='properties.NAME_1',
    locations='state',
    color='cluster',
    color_continuous_scale='Viridis',
    mapbox_style='carto-positron',
    center={"lat": 23.5937, "lon": 80.9629},
    zoom=3.8,
    opacity=0.7,
    title="State-wise Clustering Based on Sugar Price Patterns"
)

fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()
In [11]:
# === 1. Create the notebook folder if not exists
import os  # 👈 required for path operations
import pandas as pd
import os
from datetime import datetime
from pandas_profiling import ProfileReport
import subprocess

repo_path = r"C:\Users\neeti\Documents\ISB_Class of Summer_2025\04 Term 4\Foundation\Foundation-Project_Group-14"
notebook_path = os.path.join(repo_path, "notebooks")
os.makedirs(notebook_path, exist_ok=True)

# === 3. Push to GitHub
os.chdir(repo_path)
try:
    subprocess.run(["git", "add", "notebooks/EDA_Report.ipynb"], check=True)
    subprocess.run(["git", "commit", "-m", "Automated: Added EDA report notebook"], check=True)
    subprocess.run(["git", "push", "origin", "main"], check=True)
    print("🚀 EDA notebook committed and pushed to GitHub.")
except subprocess.CalledProcessError as e:
    print(f"❌ Git push failed: {e}")
C:\Users\neeti\AppData\Local\Temp\ipykernel_1876\4115147682.py:6: DeprecationWarning:

`import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.

🚀 EDA notebook committed and pushed to GitHub.
In [12]:
import subprocess
import os

# === Paths ===
notebook_file = "notebooks/EDA_Report.ipynb"
output_file = "notebooks/eda_report.html"

# === 1. Convert notebook to HTML (hide code cells)
subprocess.run([
    "jupyter", "nbconvert", notebook_file,
    "--to", "html",
    "--output", os.path.basename(output_file),
    "--no-input",
    "--output-dir", os.path.dirname(output_file)
], check=True)

print("✅ Converted notebook to HTML without code.")

# === 2. Git Add, Commit, and Push
try:
    subprocess.run(["git", "add", output_file], check=True)
    subprocess.run(["git", "commit", "-m", "🔁 Automated: Added EDA report (HTML no-code)"], check=True)
    subprocess.run(["git", "push", "origin", "main"], check=True)
    print("🚀 EDA report pushed to GitHub.")
except subprocess.CalledProcessError as e:
    print(f"❌ Git push failed: {e}")
✅ Converted notebook to HTML without code.
🚀 EDA report pushed to GitHub.
In [ ]: